In [1]:
import os
import pandas as pd
import numpy as np
import plotly_express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import warnings
warnings.filterwarnings('ignore')
from lightgbm import LGBMRegressor
import joblib
In [2]:
sales = pd.read_csv('sales_train_evaluation.csv')
sales.name = 'sales'
calendar = pd.read_csv('calendar.csv')
calendar.name = 'calendar'
prices = pd.read_csv('sell_prices.csv')
prices.name = 'prices'
In [3]:
for d in range(1942,1970):
    col = 'd_' + str(d)
    sales[col] = 0
    sales[col] = sales[col].astype(np.int16)  #add sales infoemation from d1942 to d1969 which set as 0
In [8]:
sales
Out[8]:
id item_id dept_id cat_id store_id state_id d_1 d_2 d_3 d_4 ... d_1960 d_1961 d_1962 d_1963 d_1964 d_1965 d_1966 d_1967 d_1968 d_1969
0 HOBBIES_1_001_CA_1_evaluation HOBBIES_1_001 HOBBIES_1 HOBBIES CA_1 CA 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 HOBBIES_1_002_CA_1_evaluation HOBBIES_1_002 HOBBIES_1 HOBBIES CA_1 CA 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 HOBBIES_1_003_CA_1_evaluation HOBBIES_1_003 HOBBIES_1 HOBBIES CA_1 CA 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 HOBBIES_1_004_CA_1_evaluation HOBBIES_1_004 HOBBIES_1 HOBBIES CA_1 CA 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 HOBBIES_1_005_CA_1_evaluation HOBBIES_1_005 HOBBIES_1 HOBBIES CA_1 CA 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
30485 FOODS_3_823_WI_3_evaluation FOODS_3_823 FOODS_3 FOODS WI_3 WI 0 0 2 2 ... 0 0 0 0 0 0 0 0 0 0
30486 FOODS_3_824_WI_3_evaluation FOODS_3_824 FOODS_3 FOODS WI_3 WI 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
30487 FOODS_3_825_WI_3_evaluation FOODS_3_825 FOODS_3 FOODS WI_3 WI 0 6 0 2 ... 0 0 0 0 0 0 0 0 0 0
30488 FOODS_3_826_WI_3_evaluation FOODS_3_826 FOODS_3 FOODS WI_3 WI 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
30489 FOODS_3_827_WI_3_evaluation FOODS_3_827 FOODS_3 FOODS WI_3 WI 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

30490 rows × 1975 columns

In [4]:
sales_bd = np.round(sales.memory_usage().sum()/(1024*1024),1)
calendar_bd = np.round(calendar.memory_usage().sum()/(1024*1024),1)
prices_bd = np.round(prices.memory_usage().sum()/(1024*1024),1)
In [5]:
#Downcast in order to save memory
def downcast(df):
    cols = df.dtypes.index.tolist()
    types = df.dtypes.values.tolist()
    for i,t in enumerate(types):
        if 'int' in str(t):
            if df[cols[i]].min() > np.iinfo(np.int8).min and df[cols[i]].max() < np.iinfo(np.int8).max:
                df[cols[i]] = df[cols[i]].astype(np.int8)
            elif df[cols[i]].min() > np.iinfo(np.int16).min and df[cols[i]].max() < np.iinfo(np.int16).max:
                df[cols[i]] = df[cols[i]].astype(np.int16)
            elif df[cols[i]].min() > np.iinfo(np.int32).min and df[cols[i]].max() < np.iinfo(np.int32).max:
                df[cols[i]] = df[cols[i]].astype(np.int32)
            else:
                df[cols[i]] = df[cols[i]].astype(np.int64)
        elif 'float' in str(t):
            if df[cols[i]].min() > np.finfo(np.float16).min and df[cols[i]].max() < np.finfo(np.float16).max:
                df[cols[i]] = df[cols[i]].astype(np.float16)
            elif df[cols[i]].min() > np.finfo(np.float32).min and df[cols[i]].max() < np.finfo(np.float32).max:
                df[cols[i]] = df[cols[i]].astype(np.float32)
            else:
                df[cols[i]] = df[cols[i]].astype(np.float64)
        elif t == np.object:
            if cols[i] == 'date':
                df[cols[i]] = pd.to_datetime(df[cols[i]], format='%Y-%m-%d')
            else:
                df[cols[i]] = df[cols[i]].astype('category')
    return df  

sales = downcast(sales)
prices = downcast(prices)
calendar = downcast(calendar)
In [6]:
sales_ad = np.round(sales.memory_usage().sum()/(1024*1024),1)
calendar_ad = np.round(calendar.memory_usage().sum()/(1024*1024),1)
prices_ad = np.round(prices.memory_usage().sum()/(1024*1024),1)
In [7]:
dic = {'DataFrame':['sales','calendar','prices'],
       'Before downcasting':[sales_bd,calendar_bd,prices_bd],
       'After downcasting':[sales_ad,calendar_ad,prices_ad]}

memory = pd.DataFrame(dic)
memory = pd.melt(memory, id_vars='DataFrame', var_name='Status', value_name='Memory (MB)')
memory.sort_values('Memory (MB)',inplace=True)
fig = px.bar(memory, x='DataFrame', y='Memory (MB)', color='Status', barmode='group', text='Memory (MB)')
fig.update_traces(texttemplate='%{text} MB', textposition='outside')
fig.update_layout(template='seaborn', title='Effect of Downcasting')
fig.show()
In [8]:
df = pd.melt(sales, id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name='d', value_name='sold').dropna()
In [9]:
df
Out[9]:
id item_id dept_id cat_id store_id state_id d sold
0 HOBBIES_1_001_CA_1_evaluation HOBBIES_1_001 HOBBIES_1 HOBBIES CA_1 CA d_1 0
1 HOBBIES_1_002_CA_1_evaluation HOBBIES_1_002 HOBBIES_1 HOBBIES CA_1 CA d_1 0
2 HOBBIES_1_003_CA_1_evaluation HOBBIES_1_003 HOBBIES_1 HOBBIES CA_1 CA d_1 0
3 HOBBIES_1_004_CA_1_evaluation HOBBIES_1_004 HOBBIES_1 HOBBIES CA_1 CA d_1 0
4 HOBBIES_1_005_CA_1_evaluation HOBBIES_1_005 HOBBIES_1 HOBBIES CA_1 CA d_1 0
... ... ... ... ... ... ... ... ...
60034805 FOODS_3_823_WI_3_evaluation FOODS_3_823 FOODS_3 FOODS WI_3 WI d_1969 0
60034806 FOODS_3_824_WI_3_evaluation FOODS_3_824 FOODS_3 FOODS WI_3 WI d_1969 0
60034807 FOODS_3_825_WI_3_evaluation FOODS_3_825 FOODS_3 FOODS WI_3 WI d_1969 0
60034808 FOODS_3_826_WI_3_evaluation FOODS_3_826 FOODS_3 FOODS WI_3 WI d_1969 0
60034809 FOODS_3_827_WI_3_evaluation FOODS_3_827 FOODS_3 FOODS WI_3 WI d_1969 0

60034810 rows × 8 columns

In [10]:
df = pd.merge(df, calendar, on='d', how='left')
df = pd.merge(df, prices, on=['store_id','item_id','wm_yr_wk'], how='left') 
In [11]:
df
Out[11]:
id item_id dept_id cat_id store_id state_id d sold date wm_yr_wk ... month year event_name_1 event_type_1 event_name_2 event_type_2 snap_CA snap_TX snap_WI sell_price
0 HOBBIES_1_001_CA_1_evaluation HOBBIES_1_001 HOBBIES_1 HOBBIES CA_1 CA d_1 0 2011-01-29 11101 ... 1 2011 NaN NaN NaN NaN 0 0 0 NaN
1 HOBBIES_1_002_CA_1_evaluation HOBBIES_1_002 HOBBIES_1 HOBBIES CA_1 CA d_1 0 2011-01-29 11101 ... 1 2011 NaN NaN NaN NaN 0 0 0 NaN
2 HOBBIES_1_003_CA_1_evaluation HOBBIES_1_003 HOBBIES_1 HOBBIES CA_1 CA d_1 0 2011-01-29 11101 ... 1 2011 NaN NaN NaN NaN 0 0 0 NaN
3 HOBBIES_1_004_CA_1_evaluation HOBBIES_1_004 HOBBIES_1 HOBBIES CA_1 CA d_1 0 2011-01-29 11101 ... 1 2011 NaN NaN NaN NaN 0 0 0 NaN
4 HOBBIES_1_005_CA_1_evaluation HOBBIES_1_005 HOBBIES_1 HOBBIES CA_1 CA d_1 0 2011-01-29 11101 ... 1 2011 NaN NaN NaN NaN 0 0 0 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
60034805 FOODS_3_823_WI_3_evaluation FOODS_3_823 FOODS_3 FOODS WI_3 WI d_1969 0 2016-06-19 11621 ... 6 2016 NBAFinalsEnd Sporting Father's day Cultural 0 0 0 2.980469
60034806 FOODS_3_824_WI_3_evaluation FOODS_3_824 FOODS_3 FOODS WI_3 WI d_1969 0 2016-06-19 11621 ... 6 2016 NBAFinalsEnd Sporting Father's day Cultural 0 0 0 2.480469
60034807 FOODS_3_825_WI_3_evaluation FOODS_3_825 FOODS_3 FOODS WI_3 WI d_1969 0 2016-06-19 11621 ... 6 2016 NBAFinalsEnd Sporting Father's day Cultural 0 0 0 3.980469
60034808 FOODS_3_826_WI_3_evaluation FOODS_3_826 FOODS_3 FOODS WI_3 WI d_1969 0 2016-06-19 11621 ... 6 2016 NBAFinalsEnd Sporting Father's day Cultural 0 0 0 1.280273
60034809 FOODS_3_827_WI_3_evaluation FOODS_3_827 FOODS_3 FOODS WI_3 WI d_1969 0 2016-06-19 11621 ... 6 2016 NBAFinalsEnd Sporting Father's day Cultural 0 0 0 1.000000

60034810 rows × 22 columns

In [12]:
#remove the information of 12-25 of each year
df=df[-(df['date']== "2011-12-25")&-(df['date']== "2012-12-25")&-(df['date']== "2013-12-25")&-(df['date']== "2014-12-25")&-(df['date']== "2015-12-25")]
In [13]:
df
Out[13]:
id item_id dept_id cat_id store_id state_id d sold date wm_yr_wk ... month year event_name_1 event_type_1 event_name_2 event_type_2 snap_CA snap_TX snap_WI sell_price
0 HOBBIES_1_001_CA_1_evaluation HOBBIES_1_001 HOBBIES_1 HOBBIES CA_1 CA d_1 0 2011-01-29 11101 ... 1 2011 NaN NaN NaN NaN 0 0 0 NaN
1 HOBBIES_1_002_CA_1_evaluation HOBBIES_1_002 HOBBIES_1 HOBBIES CA_1 CA d_1 0 2011-01-29 11101 ... 1 2011 NaN NaN NaN NaN 0 0 0 NaN
2 HOBBIES_1_003_CA_1_evaluation HOBBIES_1_003 HOBBIES_1 HOBBIES CA_1 CA d_1 0 2011-01-29 11101 ... 1 2011 NaN NaN NaN NaN 0 0 0 NaN
3 HOBBIES_1_004_CA_1_evaluation HOBBIES_1_004 HOBBIES_1 HOBBIES CA_1 CA d_1 0 2011-01-29 11101 ... 1 2011 NaN NaN NaN NaN 0 0 0 NaN
4 HOBBIES_1_005_CA_1_evaluation HOBBIES_1_005 HOBBIES_1 HOBBIES CA_1 CA d_1 0 2011-01-29 11101 ... 1 2011 NaN NaN NaN NaN 0 0 0 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
60034805 FOODS_3_823_WI_3_evaluation FOODS_3_823 FOODS_3 FOODS WI_3 WI d_1969 0 2016-06-19 11621 ... 6 2016 NBAFinalsEnd Sporting Father's day Cultural 0 0 0 2.980469
60034806 FOODS_3_824_WI_3_evaluation FOODS_3_824 FOODS_3 FOODS WI_3 WI d_1969 0 2016-06-19 11621 ... 6 2016 NBAFinalsEnd Sporting Father's day Cultural 0 0 0 2.480469
60034807 FOODS_3_825_WI_3_evaluation FOODS_3_825 FOODS_3 FOODS WI_3 WI d_1969 0 2016-06-19 11621 ... 6 2016 NBAFinalsEnd Sporting Father's day Cultural 0 0 0 3.980469
60034808 FOODS_3_826_WI_3_evaluation FOODS_3_826 FOODS_3 FOODS WI_3 WI d_1969 0 2016-06-19 11621 ... 6 2016 NBAFinalsEnd Sporting Father's day Cultural 0 0 0 1.280273
60034809 FOODS_3_827_WI_3_evaluation FOODS_3_827 FOODS_3 FOODS WI_3 WI d_1969 0 2016-06-19 11621 ... 6 2016 NBAFinalsEnd Sporting Father's day Cultural 0 0 0 1.000000

59882360 rows × 22 columns

In [14]:
group = sales.groupby(['state_id','store_id','cat_id','dept_id'],as_index=False)['item_id'].count().dropna()
group['USA'] = 'United States of America'
group.rename(columns={'state_id':'State','store_id':'Store','cat_id':'Category','dept_id':'Department','item_id':'Count'},inplace=True)
In [15]:
group_price_store = df.groupby(['state_id','store_id','item_id'],as_index=False)['sell_price'].mean().dropna()
In [16]:
group_price_cat = df.groupby(['store_id','cat_id','item_id'],as_index=False)['sell_price'].mean().dropna()
In [17]:
group = df.groupby(['year','date','state_id','store_id'], as_index=False)['sold'].sum().dropna()
In [18]:
fig = go.Figure()
title = 'Items sold over time'
years = group.year.unique().tolist()
buttons = []
y=3
for state in group.state_id.unique().tolist():
    group_state = group[group['state_id']==state]
    for store in group_state.store_id.unique().tolist():
        group_state_store = group_state[group_state['store_id']==store]
        fig.add_trace(go.Scatter(name=store, x=group_state_store['date'], y=group_state_store['sold'], showlegend=True, 
                                   yaxis='y'+str(y) if y!=1 else 'y'))
    y-=1

fig.update_layout(
        xaxis=dict(
        #autorange=True,
        range = ['2011-01-29','2016-05-22'],
        rangeselector=dict(
            buttons=list([
                dict(count=1,
                     label="1m",
                     step="month",
                     stepmode="backward"),
                dict(count=6,
                     label="6m",
                     step="month",
                     stepmode="backward"),
                dict(count=1,
                     label="YTD",
                     step="year",
                     stepmode="todate"),
                dict(count=1,
                     label="1y",
                     step="year",
                     stepmode="backward"),
                dict(count=2,
                     label="2y",
                     step="year",
                     stepmode="backward"),
                dict(count=3,
                     label="3y",
                     step="year",
                     stepmode="backward"),
                dict(count=4,
                     label="4y",
                     step="year",
                     stepmode="backward"),
                dict(step="all")
            ])
        ),
        rangeslider=dict(
            autorange=True,
        ),
        type="date"
    ),
    yaxis=dict(
        anchor="x",
        autorange=True,
        domain=[0, 0.33],
        mirror=True,
        showline=True,
        side="left",
        tickfont={"size":10},
        tickmode="auto",
        ticks="",
        title='WI',
        titlefont={"size":20},
        type="linear",
        zeroline=False
    ),
    yaxis2=dict(
        anchor="x",
        autorange=True,
        domain=[0.33, 0.66],
        mirror=True,
        showline=True,
        side="left",
        tickfont={"size":10},
        tickmode="auto",
        ticks="",
        title = 'TX',
        titlefont={"size":20},
        type="linear",
        zeroline=False
    ),
    yaxis3=dict(
        anchor="x",
        autorange=True,
        domain=[0.66, 1],
        mirror=True,
        showline=True,
        side="left",
        tickfont={"size":10},
        tickmode="auto",
        ticks='',
        title="CA",
        titlefont={"size":20},
        type="linear",
        zeroline=False
    )
    )
fig.update_layout(template='seaborn', title=title)
fig.show()
In [19]:
df['revenue'] = df['sold']*df['sell_price'].astype(np.float32)
In [20]:
df
Out[20]:
id item_id dept_id cat_id store_id state_id d sold date wm_yr_wk ... year event_name_1 event_type_1 event_name_2 event_type_2 snap_CA snap_TX snap_WI sell_price revenue
0 HOBBIES_1_001_CA_1_evaluation HOBBIES_1_001 HOBBIES_1 HOBBIES CA_1 CA d_1 0 2011-01-29 11101 ... 2011 NaN NaN NaN NaN 0 0 0 NaN NaN
1 HOBBIES_1_002_CA_1_evaluation HOBBIES_1_002 HOBBIES_1 HOBBIES CA_1 CA d_1 0 2011-01-29 11101 ... 2011 NaN NaN NaN NaN 0 0 0 NaN NaN
2 HOBBIES_1_003_CA_1_evaluation HOBBIES_1_003 HOBBIES_1 HOBBIES CA_1 CA d_1 0 2011-01-29 11101 ... 2011 NaN NaN NaN NaN 0 0 0 NaN NaN
3 HOBBIES_1_004_CA_1_evaluation HOBBIES_1_004 HOBBIES_1 HOBBIES CA_1 CA d_1 0 2011-01-29 11101 ... 2011 NaN NaN NaN NaN 0 0 0 NaN NaN
4 HOBBIES_1_005_CA_1_evaluation HOBBIES_1_005 HOBBIES_1 HOBBIES CA_1 CA d_1 0 2011-01-29 11101 ... 2011 NaN NaN NaN NaN 0 0 0 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
60034805 FOODS_3_823_WI_3_evaluation FOODS_3_823 FOODS_3 FOODS WI_3 WI d_1969 0 2016-06-19 11621 ... 2016 NBAFinalsEnd Sporting Father's day Cultural 0 0 0 2.980469 0.0
60034806 FOODS_3_824_WI_3_evaluation FOODS_3_824 FOODS_3 FOODS WI_3 WI d_1969 0 2016-06-19 11621 ... 2016 NBAFinalsEnd Sporting Father's day Cultural 0 0 0 2.480469 0.0
60034807 FOODS_3_825_WI_3_evaluation FOODS_3_825 FOODS_3 FOODS WI_3 WI d_1969 0 2016-06-19 11621 ... 2016 NBAFinalsEnd Sporting Father's day Cultural 0 0 0 3.980469 0.0
60034808 FOODS_3_826_WI_3_evaluation FOODS_3_826 FOODS_3 FOODS WI_3 WI d_1969 0 2016-06-19 11621 ... 2016 NBAFinalsEnd Sporting Father's day Cultural 0 0 0 1.280273 0.0
60034809 FOODS_3_827_WI_3_evaluation FOODS_3_827 FOODS_3 FOODS WI_3 WI d_1969 0 2016-06-19 11621 ... 2016 NBAFinalsEnd Sporting Father's day Cultural 0 0 0 1.000000 0.0

59882360 rows × 23 columns

In [21]:
def introduce_nulls(df):
    idx = pd.date_range(df.date.dt.date.min(), df.date.dt.date.max())
    df = df.set_index('date')
    df = df.reindex(idx)
    df.reset_index(inplace=True)
    df.rename(columns={'index':'date'},inplace=True)
    return df

def plot_metric(df,state,store,metric):
    store_sales = df[(df['state_id']==state)&(df['store_id']==store)&(df['date']<='2016-05-22')]
    food_sales = store_sales[store_sales['cat_id']=='FOODS']
    store_sales = store_sales.groupby(['date','snap_'+state],as_index=False)['sold','revenue'].sum()
    snap_sales = store_sales[store_sales['snap_'+state]==1]
    non_snap_sales = store_sales[store_sales['snap_'+state]==0]
    food_sales = food_sales.groupby(['date','snap_'+state],as_index=False)['sold','revenue'].sum()
    snap_foods = food_sales[food_sales['snap_'+state]==1]
    non_snap_foods = food_sales[food_sales['snap_'+state]==0]
    non_snap_sales = introduce_nulls(non_snap_sales)
    snap_sales = introduce_nulls(snap_sales)
    non_snap_foods = introduce_nulls(non_snap_foods)
    snap_foods = introduce_nulls(snap_foods)
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=non_snap_sales['date'],y=non_snap_sales[metric],
                           name='Total '+metric+'(Non-SNAP)'))
    fig.add_trace(go.Scatter(x=snap_sales['date'],y=snap_sales[metric],
                           name='Total '+metric+'(SNAP)'))
    fig.add_trace(go.Scatter(x=non_snap_foods['date'],y=non_snap_foods[metric],
                           name='Food '+metric+'(Non-SNAP)'))
    fig.add_trace(go.Scatter(x=snap_foods['date'],y=snap_foods[metric],
                           name='Food '+metric+'(SNAP)'))
    fig.update_yaxes(title_text='Total items sold' if metric=='sold' else 'Total revenue($)')
    fig.update_layout(template='seaborn',title=store)
    fig.update_layout(
        xaxis=dict(
        #autorange=True,
        range = ['2011-01-29','2016-05-22'],
        rangeselector=dict(
            buttons=list([
                dict(count=1,
                     label="1m",
                     step="month",
                     stepmode="backward"),
                dict(count=6,
                     label="6m",
                     step="month",
                     stepmode="backward"),
                dict(count=1,
                     label="YTD",
                     step="year",
                     stepmode="todate"),
                dict(count=1,
                     label="1y",
                     step="year",
                     stepmode="backward"),
                dict(count=2,
                     label="2y",
                     step="year",
                     stepmode="backward"),
                dict(count=3,
                     label="3y",
                     step="year",
                     stepmode="backward"),
                dict(count=4,
                     label="4y",
                     step="year",
                     stepmode="backward"),
                dict(step="all")
            ])
        ),
        rangeslider=dict(
            autorange=True,
        ),
        type="date"
    ))
    return fig
In [22]:
cal_data = group.copy()
cal_data = cal_data[cal_data.date <= '22-05-2016']
cal_data['week'] = cal_data.date.dt.weekofyear
cal_data['day_name'] = cal_data.date.dt.day_name()
In [23]:
def calmap(cal_data, state, store, scale):
    cal_data = cal_data[(cal_data['state_id']==state)&(cal_data['store_id']==store)]
    years = cal_data.year.unique().tolist()
    fig = make_subplots(rows=len(years),cols=1,shared_xaxes=True,vertical_spacing=0.005)
    r=1
    for year in years:
        data = cal_data[cal_data['year']==year]
        data = introduce_nulls(data)
        fig.add_trace(go.Heatmap(
            z=data.sold,
            x=data.week,
            y=data.day_name,
            hovertext=data.date.dt.date,
            coloraxis = "coloraxis",name=year,
        ),r,1)
        fig.update_yaxes(title_text=year,tickfont=dict(size=5),row = r,col = 1)
        r+=1
    fig.update_xaxes(range=[1,53],tickfont=dict(size=10), nticks=53)
    fig.update_layout(coloraxis = {'colorscale':scale})
    fig.update_layout(template='seaborn', title=store)
    return fig
In [24]:
d_id = dict(zip(df.id.cat.codes, df.id))
d_item_id = dict(zip(df.item_id.cat.codes, df.item_id))
d_dept_id = dict(zip(df.dept_id.cat.codes, df.dept_id))
d_cat_id = dict(zip(df.cat_id.cat.codes, df.cat_id))
d_store_id = dict(zip(df.store_id.cat.codes, df.store_id))
d_state_id = dict(zip(df.state_id.cat.codes, df.state_id))
In [25]:
del group, group_price_cat, group_price_store, group_state, group_state_store, cal_data
gc.collect();

#2
df.d = df['d'].apply(lambda x: x.split('_')[1]).astype(np.int16)
cols = df.dtypes.index.tolist()
types = df.dtypes.values.tolist()
for i,type in enumerate(types):
    if type.name == 'category':
        df[cols[i]] = df[cols[i]].cat.codes
        
#3
df.drop('date',axis=1,inplace=True)
In [26]:
df
Out[26]:
id item_id dept_id cat_id store_id state_id d sold wm_yr_wk weekday ... year event_name_1 event_type_1 event_name_2 event_type_2 snap_CA snap_TX snap_WI sell_price revenue
0 14370 1437 3 1 0 0 1 0 11101 2 ... 2011 -1 -1 -1 -1 0 0 0 NaN NaN
1 14380 1438 3 1 0 0 1 0 11101 2 ... 2011 -1 -1 -1 -1 0 0 0 NaN NaN
2 14390 1439 3 1 0 0 1 0 11101 2 ... 2011 -1 -1 -1 -1 0 0 0 NaN NaN
3 14400 1440 3 1 0 0 1 0 11101 2 ... 2011 -1 -1 -1 -1 0 0 0 NaN NaN
4 14410 1441 3 1 0 0 1 0 11101 2 ... 2011 -1 -1 -1 -1 0 0 0 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
60034805 14329 1432 2 0 9 2 1969 0 11621 3 ... 2016 16 3 2 0 0 0 0 2.980469 0.0
60034806 14339 1433 2 0 9 2 1969 0 11621 3 ... 2016 16 3 2 0 0 0 0 2.480469 0.0
60034807 14349 1434 2 0 9 2 1969 0 11621 3 ... 2016 16 3 2 0 0 0 0 3.980469 0.0
60034808 14359 1435 2 0 9 2 1969 0 11621 3 ... 2016 16 3 2 0 0 0 0 1.280273 0.0
60034809 14369 1436 2 0 9 2 1969 0 11621 3 ... 2016 16 3 2 0 0 0 0 1.000000 0.0

59882360 rows × 22 columns

In [27]:
#Introduce lags
lags = [1,2,3,6,12,24,36]
for lag in lags:
    df['sold_lag_'+str(lag)] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],as_index=False)['sold'].shift(lag).astype(np.float16)
In [28]:
df
Out[28]:
id item_id dept_id cat_id store_id state_id d sold wm_yr_wk weekday ... snap_WI sell_price revenue sold_lag_1 sold_lag_2 sold_lag_3 sold_lag_6 sold_lag_12 sold_lag_24 sold_lag_36
0 14370 1437 3 1 0 0 1 0 11101 2 ... 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 14380 1438 3 1 0 0 1 0 11101 2 ... 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 14390 1439 3 1 0 0 1 0 11101 2 ... 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 14400 1440 3 1 0 0 1 0 11101 2 ... 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 14410 1441 3 1 0 0 1 0 11101 2 ... 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
60034805 14329 1432 2 0 9 2 1969 0 11621 3 ... 0 2.980469 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
60034806 14339 1433 2 0 9 2 1969 0 11621 3 ... 0 2.480469 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
60034807 14349 1434 2 0 9 2 1969 0 11621 3 ... 0 3.980469 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
60034808 14359 1435 2 0 9 2 1969 0 11621 3 ... 0 1.280273 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
60034809 14369 1436 2 0 9 2 1969 0 11621 3 ... 0 1.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0

59882360 rows × 29 columns

In [29]:
df['iteam_sold_avg'] = df.groupby('item_id')['sold'].transform('mean').astype(np.float16)
df['state_sold_avg'] = df.groupby('state_id')['sold'].transform('mean').astype(np.float16)
df['store_sold_avg'] = df.groupby('store_id')['sold'].transform('mean').astype(np.float16)
df['cat_sold_avg'] = df.groupby('cat_id')['sold'].transform('mean').astype(np.float16)
df['dept_sold_avg'] = df.groupby('dept_id')['sold'].transform('mean').astype(np.float16)
df['cat_dept_sold_avg'] = df.groupby(['cat_id','dept_id'])['sold'].transform('mean').astype(np.float16)
df['store_item_sold_avg'] = df.groupby(['store_id','item_id'])['sold'].transform('mean').astype(np.float16)
df['cat_item_sold_avg'] = df.groupby(['cat_id','item_id'])['sold'].transform('mean').astype(np.float16)
df['dept_item_sold_avg'] = df.groupby(['dept_id','item_id'])['sold'].transform('mean').astype(np.float16)
df['state_store_sold_avg'] = df.groupby(['state_id','store_id'])['sold'].transform('mean').astype(np.float16)
df['state_store_cat_sold_avg'] = df.groupby(['state_id','store_id','cat_id'])['sold'].transform('mean').astype(np.float16)
df['store_cat_dept_sold_avg'] = df.groupby(['store_id','cat_id','dept_id'])['sold'].transform('mean').astype(np.float16)
In [30]:
df['rolling_sold_mean'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])['sold'].transform(lambda x: x.rolling(window=7).mean()).astype(np.float16)
In [31]:
df['expanding_sold_mean'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])['sold'].transform(lambda x: x.expanding(2).mean()).astype(np.float16)
In [32]:
df['daily_avg_sold'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id','d'])['sold'].transform('mean').astype(np.float16)
df['avg_sold'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])['sold'].transform('mean').astype(np.float16)
df['selling_trend'] = (df['daily_avg_sold'] - df['avg_sold']).astype(np.float16)
df.drop(['daily_avg_sold','avg_sold'],axis=1,inplace=True)
In [33]:
df = df[df['d']>=36]
In [34]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 58815210 entries, 1067150 to 60034809
Data columns (total 44 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   id                        int16  
 1   item_id                   int16  
 2   dept_id                   int8   
 3   cat_id                    int8   
 4   store_id                  int8   
 5   state_id                  int8   
 6   d                         int16  
 7   sold                      int16  
 8   wm_yr_wk                  int16  
 9   weekday                   int8   
 10  wday                      int8   
 11  month                     int8   
 12  year                      int16  
 13  event_name_1              int8   
 14  event_type_1              int8   
 15  event_name_2              int8   
 16  event_type_2              int8   
 17  snap_CA                   int8   
 18  snap_TX                   int8   
 19  snap_WI                   int8   
 20  sell_price                float16
 21  revenue                   float32
 22  sold_lag_1                float16
 23  sold_lag_2                float16
 24  sold_lag_3                float16
 25  sold_lag_6                float16
 26  sold_lag_12               float16
 27  sold_lag_24               float16
 28  sold_lag_36               float16
 29  iteam_sold_avg            float16
 30  state_sold_avg            float16
 31  store_sold_avg            float16
 32  cat_sold_avg              float16
 33  dept_sold_avg             float16
 34  cat_dept_sold_avg         float16
 35  store_item_sold_avg       float16
 36  cat_item_sold_avg         float16
 37  dept_item_sold_avg        float16
 38  state_store_sold_avg      float16
 39  state_store_cat_sold_avg  float16
 40  store_cat_dept_sold_avg   float16
 41  rolling_sold_mean         float16
 42  expanding_sold_mean       float16
 43  selling_trend             float16
dtypes: float16(23), float32(1), int16(6), int8(14)
memory usage: 4.6 GB
In [ ]:
df  
In [36]:
# split data as training and test dataset
data = df
valid = data[(data['d']>=1914) & (data['d']<1942)][['id','d','sold']]
test = data[data['d']>=1942][['id','d','sold']]
eval_preds = test['sold']
valid_preds = valid['sold']
In [37]:
#Get the store ids

stores = sales.store_id.cat.codes.unique().tolist()
for store in stores:
    df = data[data['store_id']==store]
    
    #Split the data
    X_train, y_train = df[df['d']<1914].drop('sold',axis=1), df[df['d']<1914]['sold']
    X_valid, y_valid = df[(df['d']>=1914) & (df['d']<1942)].drop('sold',axis=1), df[(df['d']>=1914) & (df['d']<1942)]['sold']
    X_test = df[df['d']>=1942].drop('sold',axis=1)
    
    #Train and validate
    model = LGBMRegressor(
        n_estimators=1000,
        learning_rate=0.3,
        subsample=0.8,
        colsample_bytree=0.8,
        max_depth=8,
        num_leaves=50,
        min_child_weight=300
    )
    print('*****Prediction for Store: {}*****'.format(d_store_id[store]))
    model.fit(X_train, y_train, eval_set=[(X_train,y_train),(X_valid,y_valid)],
             eval_metric='rmse', verbose=20, early_stopping_rounds=20)
    valid_preds[X_valid.index] = model.predict(X_valid)
    eval_preds[X_test.index] = model.predict(X_test)
    

    
    
    
    filename = 'model'+str(d_store_id[store])+'.pkl'
    # save model
    joblib.dump(model, filename)
    del model, X_train, y_train, X_valid, y_valid
    gc.collect()
*****Prediction for Store: CA_1*****
Training until validation scores don't improve for 20 rounds
[20]	training's rmse: 0.655395	training's l2: 0.429543	valid_1's rmse: 0.399456	valid_1's l2: 0.159565
[40]	training's rmse: 0.613602	training's l2: 0.376508	valid_1's rmse: 0.381093	valid_1's l2: 0.145232
[60]	training's rmse: 0.588276	training's l2: 0.346069	valid_1's rmse: 0.36713	valid_1's l2: 0.134784
[80]	training's rmse: 0.569262	training's l2: 0.324059	valid_1's rmse: 0.360459	valid_1's l2: 0.12993
[100]	training's rmse: 0.556408	training's l2: 0.30959	valid_1's rmse: 0.356837	valid_1's l2: 0.127332
[120]	training's rmse: 0.545086	training's l2: 0.297119	valid_1's rmse: 0.351696	valid_1's l2: 0.12369
[140]	training's rmse: 0.536556	training's l2: 0.287892	valid_1's rmse: 0.346915	valid_1's l2: 0.12035
[160]	training's rmse: 0.528977	training's l2: 0.279816	valid_1's rmse: 0.347031	valid_1's l2: 0.120431
[180]	training's rmse: 0.522946	training's l2: 0.273472	valid_1's rmse: 0.342349	valid_1's l2: 0.117203
[200]	training's rmse: 0.516026	training's l2: 0.266283	valid_1's rmse: 0.342502	valid_1's l2: 0.117308
[220]	training's rmse: 0.507541	training's l2: 0.257598	valid_1's rmse: 0.338884	valid_1's l2: 0.114842
[240]	training's rmse: 0.502331	training's l2: 0.252337	valid_1's rmse: 0.335845	valid_1's l2: 0.112792
[260]	training's rmse: 0.495146	training's l2: 0.24517	valid_1's rmse: 0.335786	valid_1's l2: 0.112752
[280]	training's rmse: 0.489792	training's l2: 0.239897	valid_1's rmse: 0.332293	valid_1's l2: 0.110418
[300]	training's rmse: 0.483398	training's l2: 0.233674	valid_1's rmse: 0.328998	valid_1's l2: 0.10824
[320]	training's rmse: 0.479617	training's l2: 0.230032	valid_1's rmse: 0.328497	valid_1's l2: 0.10791
Early stopping, best iteration is:
[301]	training's rmse: 0.483004	training's l2: 0.233293	valid_1's rmse: 0.328392	valid_1's l2: 0.107841
*****Prediction for Store: CA_2*****
Training until validation scores don't improve for 20 rounds
[20]	training's rmse: 0.324541	training's l2: 0.105327	valid_1's rmse: 0.353024	valid_1's l2: 0.124626
[40]	training's rmse: 0.283049	training's l2: 0.0801166	valid_1's rmse: 0.3267	valid_1's l2: 0.106733
[60]	training's rmse: 0.266964	training's l2: 0.07127	valid_1's rmse: 0.323665	valid_1's l2: 0.104759
[80]	training's rmse: 0.257433	training's l2: 0.066272	valid_1's rmse: 0.320103	valid_1's l2: 0.102466
[100]	training's rmse: 0.250932	training's l2: 0.062967	valid_1's rmse: 0.321069	valid_1's l2: 0.103085
Early stopping, best iteration is:
[80]	training's rmse: 0.257433	training's l2: 0.066272	valid_1's rmse: 0.320103	valid_1's l2: 0.102466
*****Prediction for Store: CA_3*****
Training until validation scores don't improve for 20 rounds
[20]	training's rmse: 1.06722	training's l2: 1.13896	valid_1's rmse: 0.481448	valid_1's l2: 0.231792
[40]	training's rmse: 0.98982	training's l2: 0.979743	valid_1's rmse: 0.444244	valid_1's l2: 0.197353
[60]	training's rmse: 0.958507	training's l2: 0.918735	valid_1's rmse: 0.432059	valid_1's l2: 0.186675
[80]	training's rmse: 0.916068	training's l2: 0.839181	valid_1's rmse: 0.421529	valid_1's l2: 0.177687
[100]	training's rmse: 0.896462	training's l2: 0.803644	valid_1's rmse: 0.420345	valid_1's l2: 0.17669
Early stopping, best iteration is:
[86]	training's rmse: 0.908183	training's l2: 0.824797	valid_1's rmse: 0.413383	valid_1's l2: 0.170885
*****Prediction for Store: CA_4*****
Training until validation scores don't improve for 20 rounds
[20]	training's rmse: 0.22956	training's l2: 0.0526979	valid_1's rmse: 0.166991	valid_1's l2: 0.0278861
[40]	training's rmse: 0.204373	training's l2: 0.0417683	valid_1's rmse: 0.141521	valid_1's l2: 0.0200281
[60]	training's rmse: 0.196442	training's l2: 0.0385894	valid_1's rmse: 0.139554	valid_1's l2: 0.0194754
[80]	training's rmse: 0.190206	training's l2: 0.0361782	valid_1's rmse: 0.13555	valid_1's l2: 0.0183737
[100]	training's rmse: 0.185225	training's l2: 0.0343082	valid_1's rmse: 0.131245	valid_1's l2: 0.0172254
[120]	training's rmse: 0.181625	training's l2: 0.0329875	valid_1's rmse: 0.131652	valid_1's l2: 0.0173324
Early stopping, best iteration is:
[106]	training's rmse: 0.183939	training's l2: 0.0338335	valid_1's rmse: 0.130739	valid_1's l2: 0.0170926
*****Prediction for Store: TX_1*****
Training until validation scores don't improve for 20 rounds
[20]	training's rmse: 0.610293	training's l2: 0.372458	valid_1's rmse: 0.300079	valid_1's l2: 0.0900475
[40]	training's rmse: 0.5684	training's l2: 0.323078	valid_1's rmse: 0.2878	valid_1's l2: 0.0828289
[60]	training's rmse: 0.544584	training's l2: 0.296571	valid_1's rmse: 0.280895	valid_1's l2: 0.0789021
[80]	training's rmse: 0.530704	training's l2: 0.281646	valid_1's rmse: 0.276548	valid_1's l2: 0.0764786
[100]	training's rmse: 0.517769	training's l2: 0.268084	valid_1's rmse: 0.272017	valid_1's l2: 0.0739931
[120]	training's rmse: 0.503286	training's l2: 0.253296	valid_1's rmse: 0.26837	valid_1's l2: 0.0720227
[140]	training's rmse: 0.493049	training's l2: 0.243098	valid_1's rmse: 0.268564	valid_1's l2: 0.0721269
[160]	training's rmse: 0.485071	training's l2: 0.235294	valid_1's rmse: 0.266344	valid_1's l2: 0.0709393
[180]	training's rmse: 0.476507	training's l2: 0.227059	valid_1's rmse: 0.263231	valid_1's l2: 0.0692907
[200]	training's rmse: 0.470587	training's l2: 0.221452	valid_1's rmse: 0.262352	valid_1's l2: 0.0688288
[220]	training's rmse: 0.463197	training's l2: 0.214551	valid_1's rmse: 0.261722	valid_1's l2: 0.0684985
Early stopping, best iteration is:
[214]	training's rmse: 0.465419	training's l2: 0.216615	valid_1's rmse: 0.261128	valid_1's l2: 0.068188
*****Prediction for Store: TX_2*****
Training until validation scores don't improve for 20 rounds
[20]	training's rmse: 0.715718	training's l2: 0.512252	valid_1's rmse: 0.370149	valid_1's l2: 0.13701
[40]	training's rmse: 0.657974	training's l2: 0.432929	valid_1's rmse: 0.339805	valid_1's l2: 0.115467
[60]	training's rmse: 0.625401	training's l2: 0.391126	valid_1's rmse: 0.336939	valid_1's l2: 0.113528
Early stopping, best iteration is:
[58]	training's rmse: 0.629219	training's l2: 0.395916	valid_1's rmse: 0.334453	valid_1's l2: 0.111859
*****Prediction for Store: TX_3*****
Training until validation scores don't improve for 20 rounds
[20]	training's rmse: 0.527135	training's l2: 0.277871	valid_1's rmse: 0.384117	valid_1's l2: 0.147546
[40]	training's rmse: 0.489525	training's l2: 0.239634	valid_1's rmse: 0.370173	valid_1's l2: 0.137028
[60]	training's rmse: 0.471246	training's l2: 0.222072	valid_1's rmse: 0.364636	valid_1's l2: 0.13296
[80]	training's rmse: 0.459737	training's l2: 0.211358	valid_1's rmse: 0.363232	valid_1's l2: 0.131937
[100]	training's rmse: 0.453257	training's l2: 0.205442	valid_1's rmse: 0.359872	valid_1's l2: 0.129508
[120]	training's rmse: 0.445042	training's l2: 0.198062	valid_1's rmse: 0.357586	valid_1's l2: 0.127867
Early stopping, best iteration is:
[117]	training's rmse: 0.445664	training's l2: 0.198616	valid_1's rmse: 0.356806	valid_1's l2: 0.12731
*****Prediction for Store: WI_1*****
Training until validation scores don't improve for 20 rounds
[20]	training's rmse: 0.239492	training's l2: 0.0573566	valid_1's rmse: 0.222231	valid_1's l2: 0.0493866
[40]	training's rmse: 0.214234	training's l2: 0.0458963	valid_1's rmse: 0.200376	valid_1's l2: 0.0401505
[60]	training's rmse: 0.200825	training's l2: 0.0403305	valid_1's rmse: 0.191366	valid_1's l2: 0.0366208
[80]	training's rmse: 0.193811	training's l2: 0.0375628	valid_1's rmse: 0.188034	valid_1's l2: 0.0353568
[100]	training's rmse: 0.18784	training's l2: 0.0352839	valid_1's rmse: 0.182801	valid_1's l2: 0.0334164
[120]	training's rmse: 0.183796	training's l2: 0.033781	valid_1's rmse: 0.178891	valid_1's l2: 0.0320021
[140]	training's rmse: 0.180941	training's l2: 0.0327397	valid_1's rmse: 0.179018	valid_1's l2: 0.0320476
Early stopping, best iteration is:
[124]	training's rmse: 0.182998	training's l2: 0.0334884	valid_1's rmse: 0.178823	valid_1's l2: 0.0319778
*****Prediction for Store: WI_2*****
Training until validation scores don't improve for 20 rounds
[20]	training's rmse: 0.436082	training's l2: 0.190168	valid_1's rmse: 0.59221	valid_1's l2: 0.350713
[40]	training's rmse: 0.378962	training's l2: 0.143612	valid_1's rmse: 0.541682	valid_1's l2: 0.29342
[60]	training's rmse: 0.35272	training's l2: 0.124411	valid_1's rmse: 0.515769	valid_1's l2: 0.266017
[80]	training's rmse: 0.339776	training's l2: 0.115448	valid_1's rmse: 0.509014	valid_1's l2: 0.259095
[100]	training's rmse: 0.326636	training's l2: 0.106691	valid_1's rmse: 0.501688	valid_1's l2: 0.251691
[120]	training's rmse: 0.314601	training's l2: 0.0989739	valid_1's rmse: 0.493295	valid_1's l2: 0.24334
[140]	training's rmse: 0.305384	training's l2: 0.0932596	valid_1's rmse: 0.488311	valid_1's l2: 0.238448
[160]	training's rmse: 0.299132	training's l2: 0.0894802	valid_1's rmse: 0.484575	valid_1's l2: 0.234813
[180]	training's rmse: 0.293837	training's l2: 0.0863404	valid_1's rmse: 0.482328	valid_1's l2: 0.23264
[200]	training's rmse: 0.28581	training's l2: 0.0816875	valid_1's rmse: 0.478139	valid_1's l2: 0.228617
[220]	training's rmse: 0.282042	training's l2: 0.0795477	valid_1's rmse: 0.478197	valid_1's l2: 0.228672
Early stopping, best iteration is:
[202]	training's rmse: 0.285476	training's l2: 0.0814968	valid_1's rmse: 0.477121	valid_1's l2: 0.227644
*****Prediction for Store: WI_3*****
Training until validation scores don't improve for 20 rounds
[20]	training's rmse: 0.518431	training's l2: 0.268771	valid_1's rmse: 0.3779	valid_1's l2: 0.142809
[40]	training's rmse: 0.487469	training's l2: 0.237626	valid_1's rmse: 0.352151	valid_1's l2: 0.124011
[60]	training's rmse: 0.464214	training's l2: 0.215494	valid_1's rmse: 0.334132	valid_1's l2: 0.111644
[80]	training's rmse: 0.449917	training's l2: 0.202426	valid_1's rmse: 0.326565	valid_1's l2: 0.106645
Early stopping, best iteration is:
[78]	training's rmse: 0.450978	training's l2: 0.203381	valid_1's rmse: 0.32644	valid_1's l2: 0.106563
In [40]:
feature_importance_df = pd.DataFrame()
features = [f for f in data.columns if f != 'sold']
for filename in os.listdir('/Users/yanminglai'):
    if 'model' in filename:
        # load model
        model = joblib.load(filename)
        store_importance_df = pd.DataFrame()
        store_importance_df["feature"] = features
        store_importance_df["importance"] = model.feature_importances_
        store_importance_df["store"] = filename[5:9]
        feature_importance_df = pd.concat([feature_importance_df, store_importance_df], axis=0)
    
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:20].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (averaged over store predictions)')
    plt.tight_layout()
    
display_importances(feature_importance_df)
In [41]:
#Set actual equal to false if you want to top in the public leaderboard :P
actual = False
if actual == False:
    #Get the validation results(We already have them as less than one month left for competition to end)
    validation = sales[['id']+['d_' + str(i) for i in range(1914,1942)]]
    validation['id']=pd.read_csv('sales_train_validation.csv').id
    validation.columns=['id'] + ['F' + str(i + 1) for i in range(28)]
else:
    #Get the actual validation results
    valid['sold'] = valid_preds
    validation = valid[['id','d','sold']]
    validation = pd.pivot(validation, index='id', columns='d', values='sold').reset_index()
    validation.columns=['id'] + ['F' + str(i + 1) for i in range(28)]
    validation.id = validation.id.map(d_id).str.replace('evaluation','validation')

#Get the evaluation results
test['sold'] = eval_preds
evaluation = test[['id','d','sold']]
evaluation = pd.pivot(evaluation, index='id', columns='d', values='sold').reset_index()
evaluation.columns=['id'] + ['F' + str(i + 1) for i in range(28)]
#Remap the category id to their respective categories
evaluation.id = evaluation.id.map(d_id)

#Prepare the submission
submit = pd.concat([validation,evaluation]).reset_index(drop=True)
submit.to_csv('submissionfirstime.csv',index=False)

Discrimination -- SMD for continuous values

In [44]:
validation.to_csv('validation.csv')
In [46]:
import csv
key_words = [ 

 ]

listname = 'validation_result'
listname_accept = (listname) + '.csv'
listname_rejects = (listname) + '_rejected.csv'

with open('validation.csv') as oldfile, open(listname_accept,'w') as cleaned, open(listname_rejects,'w') as matched:
    accept_writer=csv.writer(cleaned) # create one csv writer object
    reject_writer=csv.writer(matched) #
    initial_reader=csv.reader(oldfile)
    foods_count = foods_num = hobbies_count = hobbies_num = household_count = household_num = 0
    CA_count = CA_num = TX_count = TX_num = WI_num = WI_count = 0

    for c,row in enumerate(initial_reader): 
        if c==0:                            # first row is the header
            header=row[:]
            del header[0]       # delete original header
            header[1:0]=['Department','DepartmentID','ItemID','State','StoreID'] # insert these column names
            accept_writer.writerow(header)                
            reject_writer.writerow(header)                  
        else:                                               # for all other input rows, except the first
            address_list=[i.strip() for i in row[1].split('_')] 
            all_address = address_list
            del all_address[-1]
            #print(all_address)
            del row[1]                                          
            row[1:1]=all_address 
            #calculation-----------------------------------------------
            if(row[1] == 'FOODS'):
            	foods_count += 1
            	foods_num += float(row[6])
            if(row[1] == 'HOBBIES'):
            	hobbies_count += 1
            	hobbies_num += float(row[6])
            if(row[1] == 'HOUSEHOLD'):
            	household_count += 1
            	household_num += float(row[6])
            if(row[4] == 'CA'):
            	CA_count += 1
            	CA_num += float(row[6])
            if(row[4] == 'TX'):
            	TX_count += 1
            	TX_num += float(row[6])
            if(row[4] == 'WI'):
            	WI_count += 1
            	WI_num += float(row[6])

            

            #calculation-----------------------------------------------                         
            if row[0] not in key_words:                            # test if name in key_words
                accept_writer.writerow(row)
            else:
                reject_writer.writerow(row)

    print("FOODS mean = ", foods_num / foods_count)
    print("HOBBIES mean = ", hobbies_num / hobbies_count)
    print("HOUSEHOLD mean = ", household_num / household_count)
    print("CA mean = ", CA_num / CA_count)
    print("TX mean = ", TX_num / TX_count)
    print("WI mean = ", WI_num / WI_count)
FOODS mean =  1.806054279749478
HOBBIES mean =  0.6693805309734513
HOUSEHOLD mean =  0.8651384909264566
CA mean =  1.4368645457527058
TX mean =  1.1656280747786159
WI mean =  1.1596151743741117
In [47]:
result=pd.read_csv('validation_result.csv')

smd among states

In [48]:
# Since we have one reference group and two protected groups here, 
# we need to calculate two SMD between one reference group and one protected group 

df = result.where(result["State"]!="WI").dropna() #filter
df2 = result.where(result['State']!="TX").dropna() 

sd2 = df.loc[:,"F1"].std() #standard deviation of the prediction on the next day
sd3 = df2.loc[:,"F1"].std() 

# Calculate SMD respectively 
SMD_1 = (CA_num / CA_count - TX_num / TX_count) / sd2
SMD_2 = (CA_num / CA_count - WI_num / WI_count) / sd3
print("SMD between CA and TX", SMD_1)
print("SMD between CA and WI", SMD_2)
SMD between CA and TX 0.08139434324410068
SMD between CA and WI 0.08538665411573497

There is barely discrimination among states because the difference is quite small, below the 0.2 threshold for a small difference. SMD also points to low disparity among states in this model.

SMD among departments

In [49]:
#We have three broad departments including food, household, and hobbies.
#Hobbies is the reference group, and the rest two are protected groups
df3 = result.where(result['Department'] !="Food").dropna()
df4 = result.where(result['Department'] != "Household").dropna()

#Calculate SD for prediction on the next day
sd4 = df3.loc[:,"F1"].std()
sd5 = df4.loc[:,"F1"].std()


SMD_3 = (household_num / household_count - hobbies_num / hobbies_count) / sd4
SMD_4 = (foods_num / foods_count - hobbies_num / hobbies_count) / sd5
print("SMD between household and hobbies departments", SMD_3)
print("SMD between foods and hobbies departments", SMD_4)
SMD between household and hobbies departments 0.06046404396350667
SMD between foods and hobbies departments 0.35108606329294

There is little discrimination between household and hobbies departments because the difference is quite small, below than 0.2. Foods department receive a higher average probability of amount of sales than do hobbies department. We claim that there is a small discrimination between food and hobbies departments due to between 0.2 and 0.5 threshold for a small difference.

In [ ]: